#!/usr/bin/env ruby
# -*- coding: us-ascii -*-

# Regex-based helpers for pulling the title, the body and plain text out of an
# HTML document.
#
# This is pattern matching, not a real HTML parser: a '>' inside an attribute
# value, or markup-looking text inside a script, can confuse it. Every public
# method calls +to_str+ on its argument, so passing an object that does not
# respond to +to_str+ (for example nil) raises NoMethodError.
module HTMLStripper
  # Unless noted, patterns are case-insensitive (i) and "." also matches
  # newlines (m), so elements may span several lines.
  TITLE_RE = %r'<title[^>]*?>(.*?)</title[^>]*?>'im
  BODY_RE = %r'<body[^>]*?>(.*?)</body[^>]*?>'im
  SCRIPT_RE = %r'<script[^>]*?>.*?</script[^>]*?>'im
  STYLE_RE = %r'<style[^>]*?>.*?</style[^>]*?>'im
  # START_TAG_RE cannot match comments ("<!" must be followed by a word
  # character there), so they get a pattern of their own.
  COMMENT_RE = %r'<!--.*?-->'m
  START_TAG_RE = %r'<!?\w+[^>]*?>'im  # cheating here for DOCTYPE :-(
  END_TAG_RE = %r'</\w+[^>]*?>'im

  # Returns the whitespace-stripped contents of the first <title> element,
  # or '' when there is none.
  def self.title(html)
    first_capture(html, TITLE_RE)
  end

  # Returns the whitespace-stripped contents of the first <body> element
  # (inner markup included), or '' when there is none.
  def self.body(html)
    first_capture(html, BODY_RE)
  end

  # Returns a copy of html with every <script>...</script> element removed,
  # contents included.
  def self.delete_scripts(html)
    html.to_str.gsub(SCRIPT_RE, '')
  end

  # Returns a copy of html with every <style>...</style> element removed,
  # contents included.
  def self.delete_styles(html)
    html.to_str.gsub(STYLE_RE, '')
  end

  # Returns a copy of html with comments, start tags (including DOCTYPE) and
  # end tags removed. Text between tags is kept, so run delete_scripts and
  # delete_styles first if their contents should not survive.
  def self.delete_tags(html)
    # Comments go first: once the tags nested inside a comment have been
    # stripped, the leftover "<!-- text -->" would otherwise stay in the output.
    without_comments = html.to_str.gsub(COMMENT_RE, '')
    without_comments.gsub(START_TAG_RE, '').gsub(END_TAG_RE, '')
  end

  # Returns the text of the <body> element with styles, scripts, comments
  # and tags removed; '' when the document has no body.
  def self.body_text(html)
    # Same pipeline as all_text, so CSS inside the body does not leak through.
    all_text(body(html))
  end

  # Returns the text of the whole document with styles, scripts, comments
  # and tags removed.
  def self.all_text(html)
    # Styles and scripts are removed before tags so their contents go too.
    delete_tags(delete_scripts(delete_styles(html)))
  end

  # Returns the stripped first capture group of pattern within html, or ''
  # when the pattern does not match.
  def self.first_capture(html, pattern)
    captured = html.to_str[pattern, 1]
    captured ? captured.strip : ''
  end
  private_class_method :first_capture
end

########################################

# if $0 == __FILE__
#   start = %Q'<HTML><HEAD><TITLE \r\nLANG="FR">\r\nBonjour!\r\n</TITLE>'
#   
#   title = HTMLStripper.title(start)
#   raise 'bad title' unless title == 'Bonjour!'
#   
#   icky = <<-ICKY
#     </head>
#     <body link=#0000FF\nvlink=#FF00FF>
#     <script\nlanguage="Javascript">
#     if (5 > 3) { document.write("5 > 3"); }
#     </script>
#     <h1>Greetings</h1>
#     <p>
#     <script>\ndocument.write("Hello, moron!");\n</script>
#     Hello, friend!
#     </p>
#     </body>
#     </html>
#   ICKY
#   
#   body = HTMLStripper.body(icky)
#   raise 'bad body' if body.include? '</head>'
#   raise 'bad body' if body.include? '<body'
#   raise 'bad body' if body.include? '00FF'
#   raise 'bad body' if body.include? '</body>'
#   raise 'bad body' if body.include? '</html>'
#   
#   nojs = HTMLStripper.delete_scripts(body)
#   raise 'bad delete_scripts' if nojs.include? '<script'
#   raise 'bad delete_scripts' if nojs.include? '</script>'
#   
#   notags = HTMLStripper.delete_tags(nojs)
#   raise 'bad delete_tags' if notags.include? 'h1'
#   raise 'bad delete_tags' if notags.include? 'p>'
#   
#   text = HTMLStripper.all_text(start + icky)
#   raise 'bad all_text' if text.include? '<'
#   raise 'bad all_text' if text.include? '>'
#   raise 'bad all_text' if text.include? 'Hello, moron!'
#   raise 'bad all_text' unless text.include? 'Bonjour!'
#   raise 'bad all_text' unless text.include? 'Greetings'
#   raise 'bad all_text' unless text.include? 'Hello, friend!'
#   
#   puts 'all little tests pass :-)'
# end
